/*																		
	Purpose: Clean PSID data for IGE analysis. That is,
			 calculate actual income (1,5, and 10 years) 
			 and predicted income for linked PSID fathers
			 and calculate 1-year actual income for linked
			 adult child respondents aged 30-50.

	Notes: (1) This do-file takes some code from earlier
		       Mazumder work.
		   (2) Output of this file is used to create Table
		       A.2							
	
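    Creates: PSIDFatherSons_IGEanalysis_`x'yrs, where x is 1, 5, or 10
             (children linked to fathers with x years of father income),
             and PSIDFatherSONS_IGEanalysis_0yrs (children only, not
             merged with father income). Note the differing capitalisation
             of the two file names. Several sample extracts are also
             saved in ./output along the way.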
*/

clear
set more off

cd "$Mydirectory1/1_DataSources/PSID"

****************
*** PSID: FATHERS
****************

use ./RawData/PSID_raw_indfam.dta, clear

* Person-level key: family id times 1000 plus the person number
	generate father_id = personnumber + 1000*famid
	order father_id, before(famid)
	label variable father_id "Father 1968 ID"

/*  Sample restriction (following the Mazumder paper):
    keep the main nationally representative sample (SRC, famid<3000)
    and the SEO sample (5000<famid<7000, used in robustness checks).
    Immigrant samples (3000-5000) and the later Latino families (>7000)
    are dropped.

	Source: https://psidonline.isr.umich.edu/guide/faq.aspx   */
	generate src_sample = (famid<3000)
	generate seo_sample = (famid>5000) & (famid<7000)
	drop if !(src_sample==1 | seo_sample==1)

* Side extract: men aged 30-50 in 1968 (main data are restored afterwards)
	preserve

		keep if sex==1 & age1968>=30 & age1968<=50
		keep age1968 grade1968 race1968 indweight1968

		* Strip the 1968 suffix; the individual weight becomes weight_psid
		rename age1968 age
		rename grade1968 grade
		rename race1968 race
		rename indweight1968 weight_psid

		generate black = (race==2)
		generate sex = 1

		* Codes 98 and above, and 0 ("inappropriate"), are not years of schooling
		replace grade = . if grade==0 | grade>=98
		generate hs_ed = !missing(grade) & grade>=12
		generate coll_ed = !missing(grade) & grade>=16

		save ./output/Men30to50_1968.dta, replace

	restore
		
* Restrict to individuals identified as fathers in the downloaded PSID FIMS extract
* (only matched records are kept; the merge indicator is not stored)

	merge 1:1 father_id using ./FIMS/FIMSFathers_SRC_SEO.dta, keep(match) nogenerate
	drop number

*------------------------------------------------------------*	
*---------------------------*
* Keep extract with fathers
*---------------------------*
/* Builds ./output/Fathers30to50.dta: one row per linked father, taken
   from the earliest survey year in which his reported age is 30-50,
   carrying the first non-missing individual weight found in that window.
   The main data are restored afterwards. */
	
	preserve
		
		keep father_id grade* race* indweight* indweight_coreimm* age* num*
		
		reshape long grade race indweight indweight_coreimm age num_, i(father_id) j(year)
		drop agehead*
		
		* From 1997 on, use the core/immigrant individual weight; zero weights are unusable
		replace indweight = indweight_coreimm if year>=1997 & year<=2015
		drop indweight_coreimm
		replace indweight=. if indweight==0
		
		keep if (age>=30 & age<=50)
		gen sex=1
		
	* Grab weight in first year available
	/* The within-father order is declared explicitly (by year) so that
	   observation 1 really is the earliest year, rather than depending on
	   whatever order earlier commands happened to leave the data in. */
		bysort father_id (year): gen number_obs= _n
		gen weight=.
		forval i=1(1)21 {
			* Take the weight of the i-th year only if no earlier year supplied one
			gen weight_temp = indweight if number_obs==`i'
			bysort father_id (year): egen weight2 = max(weight_temp)
			replace weight=weight2 if weight==.
			drop weight_temp weight2
		}
		
	* Keep one observation per father (the earliest year in the age window)
		bysort father_id (year): keep if _n==1
		keep if weight<.
		
	* Correct a couple variables
		gen black = race==2	
		replace grade=. if grade>=98 | grade==0 | grade==99 //"0" = "inappropriate" 	
		gen hs_ed = grade>=12 & grade<.
		gen coll_ed = grade>=16 & grade<.
		
		save ./output/Fathers30to50.dta, replace	

	restore	
	
*------------------------------------------------------------*

* Reshape to one row per father and survey year
	drop laborinc* seo_sample HHlaborinc* HHunioncontract* intnumber* union* h_w* marital*

	#delimit ;
	reshape long age agehead relate race grade state selfemployed mainocc mainocc_retro totfaminc
			famweight indweight famweight_coreimm indweight_coreimm num_ sequencenumber,
			i(father_id sex) j(year);
	#delimit cr

* Keep respondents that were heads at least once
* (relationship code 1 before 1983, code 10 from 1983 on)
	generate father_temp = (relate==1 & year<1983) | (relate==10 & year>=1983)
	bysort father_id: egen father_head = max(father_temp)
	tabulate father_head
	keep if father_head==1
	
* Clean race variable: codes strictly between 2 and 8, and codes 0 and 9, become missing
	replace race = . if (race>2 & race<8) | race==0 | race==9
	label variable race "Race"

* Region
	/*Notes: (1) See https://psidonline.isr.umich.edu/data/Documentation/PSIDStateCodes.pdf 
	             for more info.
	         (2) The South includes Texas, Oklahoma, Arkansas, Louisiana, 
	             Mississippi, Alabama, Tennessee, Kentucky, FL, GA, SC, NC, 
	             VA, MD, DE, DC, and WV */
	tabulate state, missing nolabel
	* Indicator is left missing when the state code itself is missing
	generate south_merge = inlist(state, 1, 3, 7, 8, 9, 10, 16, 17, 19, 23, 32, 35, 39, 41, 42, 45, 47) if state<.
	tabulate south_merge, missing
	label variable south_merge "Living in South"
	
* Education
	* Codes 98 and above, and 0 ("inappropriate"), are not years of schooling
	replace grade = . if grade==0 | grade>=98

	* Highest grade ever reported by the father
	bysort father_id: egen grade_max = max(grade)
	rename grade_max yrsschool_dad
	label variable yrsschool_dad "Maximum years of school for dad"

	* Years of school (binned into six ordered categories)
	*   1 = less than grade school, 2 = grade school, 3 = some HS,
	*   4 = HS, 5 = some college, 6 = 16 or more years
	generate edu_temp = cond(yrsschool_dad>=16, 6, ///
					   cond(yrsschool_dad>12,  5, ///
					   cond(yrsschool_dad==12, 4, ///
					   cond(yrsschool_dad>8,   3, ///
					   cond(yrsschool_dad==8,  2, ///
					   cond(yrsschool_dad>=1,  1, .)))))) if yrsschool_dad<.

	bysort father_id: egen edu_dad = max(edu_temp)
	label variable edu_dad "Maximum education recorded for dad (in bins)"
	drop edu_temp

	* Same information expressed as a representative number of years per bin
	tabulate yrsschool_dad, nolabel missing
	generate edu_dad_bin = cond(inrange(yrsschool_dad, 1, 7),   6, ///
						  cond(yrsschool_dad==8,                8, ///
						  cond(inrange(yrsschool_dad, 9, 11),  10, ///
						  cond(yrsschool_dad==12,              12, ///
						  cond(inrange(yrsschool_dad, 13, 15), 14, ///
						  cond(yrsschool_dad>=16 & yrsschool_dad<., 16, .))))))
	label variable edu_dad_bin "Dad years of school, binned"

	* HS / college indicators, missing when the education bin is missing
	generate dad_hs_ed = (edu_dad>=4) if !missing(edu_dad)
	generate dad_coll_ed = (edu_dad>=6) if !missing(edu_dad)
	label variable dad_hs_ed "Dad HS educated" 
	label variable dad_coll_ed "Dad college educated"
	
* Age

	/* Change agehead to be missing when respondent 
       is not household head. 
       Source of code: http://psidonline.isr.umich.edu/Guide/tutorials/IG/IG.pdf - page 37 */
	replace agehead=. if relate!=1 & year==1968
	replace agehead=. if (relate!=1 | sequencenumber!=1) & (year>=1969 & year<=1982)
	replace agehead=. if (relate!=10 | sequencenumber!=1) & (year>=1983 & year<=2015)

	//Construct consistent age variable
	/* Recode ages 0 and 999 to missing before deriving birth years, exactly
	   as the adult-children section of this file does. Without this, one
	   year with age 999 yields a birth year near 1000, which becomes the
	   minimum, pushes every age_f above 100 and therefore to missing. */
	replace age=. if age==0 | age==999
	gen byr_f = year - age
	bysort father_id: egen byr_f2 = min(byr_f) //take minimum of all possible birth years
	drop byr_f
	label var byr_f2 "Birth year (using minimum)"
	* One year is subtracted from the survey-year age when building age_f
	gen age_f = year - byr_f2 - 1 
	label var age_f "Age of father"
	replace age_f=. if age_f<=0 | age_f>100
	
	
* Father occupation (current, harmonized across different variables)
	* Codes 0 and 999 are treated as missing in both occupation variables
	foreach occvar of varlist mainocc mainocc_retro {
		replace `occvar' = . if `occvar'==0 | `occvar'==999
	}

	* Start from the main occupation; use the retrospective coding for
	* 1968-1973 and 1975-1980, and for 1974 only when the main one is missing
	generate occupation = mainocc
	replace occupation = mainocc_retro if (year>=1968 & year<=1973) | (year>1974 & year<=1980)
	replace occupation = mainocc_retro if year==1974 & mainocc==. & mainocc_retro!=.
	label variable occupation "Occupation, 1970 census"

* Self-employed 
	* selfemployed codes 1 and 2 map to 0, code 3 maps to 1, anything else to missing
	generate selfemp = cond(selfemployed==3, 1, cond(inlist(selfemployed, 1, 2), 0, .))
	label variable selfemp "Self employed"

* Crosswalk PSID occupations (1970 Census-based) to coarsened ANES occupations
	generate census1970 = occupation
	* Crosswalk rows with no PSID counterpart are discarded at merge time
	merge m:1 census1970 using ../Crosswalks/Crosswalk_1970Census_toANES.dta, keep(master match)
	* Before 2002, the only unmatched records should be those with no occupation
	tabulate census1970 if _merge==1 & year<2002, missing
	assert census1970==. if _merge==1 & year<2002
	drop _merge

	label variable fatheroccej "IGE occupation, using crosswalk"

/* The PSID used 1970 Census occupations until 2000. 
   (Temporarily) code father occupation in subsequent 
   years as missing. */
	replace fatheroccej = . if year>=2002

* Fix self-employed workers: main occupation codes 201-246 are recoded to category 21
	replace fatheroccej = 21 if inrange(mainocc, 201, 246) & selfemp==1

* Crosswalk PSID occupations (2000 Census-based) to coarsened ANES occupations
	clonevar occ2000 = occupation
	replace occ2000 = . if year<2002

	merge m:1 occ2000 using ../Crosswalks/Crosswalk_2000Census_toANES.dta, keep(master match)
	* After 2002, the only unmatched records should be those with no occupation
	tabulate occ2000 if year>2002 & _merge==1, missing
	assert occ2000==. if _merge==1 & year>2002
	drop _merge

	* Same self-employment fix for the 2000-based coding, then fill in 2002 onward
	replace fatheroccej_2000 = 21 if fatheroccej_2000==28 & selfemp==1
	replace fatheroccej = fatheroccej_2000 if year>=2002

	drop fatheroccej_2000

	tabulate fatheroccej if inrange(age_f, 30, 50), missing
	
	
* Predicted father income (i.e., "income scores")
	//occ x race x south level
	merge m:1 fatheroccej race south_merge using ../CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta, keep(master match)
	* Unmatched records must be missing at least one of the three merge keys
	assert fatheroccej==. | race==. | south_merge==. if _merge==1
	tabulate fatheroccej if _merge==1, missing
	drop _merge

	//occupation-only level
	merge m:1 fatheroccej using ../CensusData/output/IncomeScores_Coarsened_all.dta, keep(master match)
	* Unmatched records must have no occupation category
	assert fatheroccej==. if _merge==1
	tabulate fatheroccej if _merge==1, missing
	drop _merge

* Keep relevant variables: everything up to occ2000 (in dataset order) plus the two income scores
	keep father_id-occ2000 avg_HHinc_1970_byocc_byr_bys avg_HHinc_1970_byocc

* Actual father income (1-,5-, and 10- year average)

	//Convert family income to 1950 dollars
	* Total family income refers to the year before the survey, so the deflator is matched on year-1
	generate year_CPI = year - 1

	* Deflator rows with no matching father-year are discarded
	merge m:1 year_CPI using ../CPI/CPI_deflator.dta, keep(master match) nogenerate

	replace totfaminc = deflator * totfaminc

	//Fix income variable: non-positive incomes are set to missing
	summarize totfaminc, detail
	replace totfaminc = . if totfaminc<=0
	summarize totfaminc, detail
	
	*---------------------------*
	/* MAZUMDER APPROACH TO 
	   FINDING ACTUAL INCOME */
	*---------------------------*
	/* For each window length c (1, 5, 10 years) this section tags up to c
	   father-years with usable income, starting at age 40 and moving outward
	   to ages 30-50, averages predicted and actual income over the tagged
	   years, collapses to one row per father and stores the result in the
	   tempfile PSIDFathers_c. The full panel is restored after each pass. */
	sort father_id year

	rename avg_HHinc_1970_byocc_byr_bys HHincome 
	rename avg_HHinc_1970_byocc income_occ  

	* Predicted incomes are only usable in years where actual income is observed
	replace HHincome=. if totfaminc==.
	replace income_occ=. if totfaminc==.

	global numberyears "1 5 10" 
	foreach c in $numberyears {
		
		preserve 
		
		* Center around age 40
		local father_center=40
		local center `father_center'

		* Look between 30 and 50
		local father_band=10 
		local band `father_band'

      * # of observations that'll be tagged (changes from 1 to 5 to 10)
		local fcount=`c' 
		local obs_ct `fcount'

		foreach x of varlist HHincome income_occ  {

	      * Binary: data is okay to use (i.e., there's non-missing income at age 40)
			gen indfather_`x' = 0
			replace indfather_`x' = 1 if `x'!=. & age_f==`center'
				
	      * Find total of this binary for each person
			by father_id: egen total_indf_`x' = total(indfather_`x')

	    /* Now iteratively search the bands starting at the center. 
	       Tag the observation in the sample if it's non-missing and 
	       the respondent has not reached the observation count (1,5, or 10). 
	       The total number of observations used is then recalculated. 
	       Note that the upper band is arbitrarily privileged. 
	    */  
			forval i = 0/`band' {
				* Upper band (41-50)
				replace indfather_`x' = 1 if `x'!=. & age_f==(`center'+`i') & (total_indf_`x'<`obs_ct')
				
            * Re-calculate the total observations used
				drop total_indf_`x'
				by father_id: egen total_indf_`x' = total(indfather_`x') 

            * Lower band (30-39)
				replace indfather_`x' = 1 if `x'!=. & age_f==(`center'-`i') & total_indf_`x'<`obs_ct'
				
            * Re-calculate the total observations used
				drop total_indf_`x'
				by father_id: egen total_indf_`x' = total(indfather_`x') 
			}

    		* Calculate average income (1,5, or 10 years) using tagged years 
    		* (the mean is computed on tagged rows, then spread to every row of the father)
			by father_id: egen mean_fam_inc_`x'_tmp = mean(`x') if indfather_`x'==1
			by father_id: egen mean_fam_inc_`x' = min(mean_fam_inc_`x'_tmp)
			label var mean_fam_inc_`x' "Father's avg. occ. income using `c' years and `x'"

		}

	/* Everything below uses the years tagged for income_occ. The tag and
	   counter variables are written out in full (indfather_income_occ,
	   total_indf_income_occ) instead of relying on variable abbreviation,
	   so the code also runs with "set varabbrev off". */

	//Calculate mean *actual* total family income
		by father_id: egen mean_father_totfaminc_tmp = mean(totfaminc) if indfather_income_occ==1
		by father_id: egen mean_father_totfaminc = min(mean_father_totfaminc_tmp)
		label var mean_father_totfaminc "Father's avg. actual income using `c' years"
		
	//Grab modal occupation (lowest code in case of ties)
		by father_id: egen mode_fatherocc_tmp = mode(fatheroccej) if indfather_income_occ==1, minmode
		by father_id: egen mode_fatherocc = min(mode_fatherocc_tmp)

	//Count distinct occupation values among the tagged years
		bysort father_id indfather_income_occ fatheroccej: gen test=1 if _n==1 
		bysort father_id indfather_income_occ : egen test2 = sum(test) if indfather_income_occ==1 
		bysort father_id: egen number_occs = min(test2)
		
	//Calculate maximum year of tagged income
		by father_id: egen max_father_year_tmp = max(year) if indfather_income_occ==1
		by father_id: egen max_father_year = min(max_father_year_tmp)
		label var max_father_year "Father's max year"


	* Save
			
		//Whether father is a farmer (modal occupation category 71 or 81; missing if no mode)
			gen fatherfarm=0
			replace fatherfarm=. if mode_fatherocc==.
			replace fatherfarm=1 if mode_fatherocc==71 | mode_fatherocc==81

		//Keep one observation per father
			sort father_id
			by father_id: keep if _n==1

		//Drop observations w/o enough years of income
			drop if total_indf_income_occ<`obs_ct'

		//Drop observations w/o any available income measures
			drop if mean_fam_inc_HHincome ==. |  mean_father_totfaminc==. | mean_fam_inc_income_occ==.

		//Trim
			drop *_tmp
			keep famid father_id mean_fam_inc* mean_father_totfaminc* mode* fatherocc* fatherfarm number_occs max_father_year edu_dad yrsschool_dad edu_dad_bin dad_hs_ed dad_coll_ed

			tempfile PSIDFathers_`c'
			save `PSIDFathers_`c''
			restore
	}

*------------------------------------------------------------------------------* 
*------------------------------------------------------------------------------*

****************************
*** PSID (ADULT CHILDREN)
****************************

use ./RawData/PSID_raw_indfam.dta, clear

* Unique identifier: family id times 1000 plus the person number
	generate son_id = personnumber + 1000*famid
	order son_id, before(famid)
	label variable son_id "Child 1968 ID"

* Merge in employment variables (1979-2015); all records are retained
	merge 1:1 son_id using "./RawData/PSID_raw_indfam_empvar.dta", nogenerate

* Merge in geographic variables; records found only in the geographic file are discarded
	merge 1:1 son_id using "./RawData/geopsid.dta", keep(master match) nogenerate

/*  Sample restriction (following the Mazumder paper):
    keep the main nationally representative sample (SRC, famid<3000)
    and the SEO sample (5000<famid<7000, used in robustness checks).
    Immigrant samples (3000-5000) and the later Latino families (>7000)
    are dropped.

	Source: https://psidonline.isr.umich.edu/guide/faq.aspx   */
	generate src_sample = (famid<3000)
	generate seo_sample = (famid>5000) & (famid<7000)
	drop if !(src_sample==1 | seo_sample==1)

/* PSID FIMS sample of respondents who 
   can be linked to a father (matched records only) */
	merge 1:1 son_id using ./FIMS/FIMSFathersKids_SRC_SEO.dta, keep(match) nogenerate
	order father_id, after(son_id)
		
* Reshape to one row per child and survey year
	drop laborinc* seo_sample HHlaborinc* HHunioncontract* intnumber*
	reshape long age maritalstatus agehead relate race grade state union unions mainocc mainocc_retro selfemployed ///
			 h_w_inc_ h_w_acc_ totfaminc empstatus head foreignborn fatherforeign moved_state ///
			 moved_region state_childhood region4_childhood famweight indweight famweight_coreimm ///
			 indweight_coreimm num_ sequencenumber, i(son_id father_id sex) j(year)
	drop ER* V*
	sort son_id year

/* Side extract: the distinct father id codes of children observed
   at a reported age of 30-50. The panel is restored afterwards. */
preserve
	keep if inrange(age, 30, 50)
	keep father_id

	* One row per father id
	bysort father_id: keep if _n==1

	save ./output/Fathers_son30to50_inPSID.dta, replace

restore
	
/* Keep individuals whose relationship to the head was "child" in at
   least one survey year (code 3 before 1983, code 30 from 1983 on). */
	generate son_tmp = (relate==3 & year<1983) | (relate==30 & year>=1983)
	by son_id: egen son_present = max(son_tmp)
	tabulate son_present
	label variable son_present "Individual labeled as child at some point"
	keep if son_present==1

* Weight: from 1997 on, use the core/immigrant individual weight
	replace indweight = indweight_coreimm if inrange(year, 1997, 2015)
	
****************
*** Demographic variables
****************

	replace grade=. if grade>=98 | grade==0 //note: "0" = "inappropriate"
	
	* Highest grade ever reported by the child
	bysort son_id: egen yrsschool = max(grade)
	
* Education
	gen eduR=.
	replace eduR=1 if yrsschool>=1 &  yrsschool<8 //<grade school
	replace eduR=2 if yrsschool==8 //grade school
	replace eduR=3 if yrsschool>8 & yrsschool<12 //some HS
	replace eduR=4 if yrsschool==12 //HS
	replace eduR=5 if yrsschool>12 & yrsschool<16 //some college
	/* Top category is open-ended (any non-missing value of 16 or more), in
	   line with yrsschool_bin below and with the father bins earlier in this
	   file. The previous upper bound of 18 left eduR, hs_ed and coll_ed
	   missing for any child with 18 or more years while yrsschool_bin
	   still classified that child as 16. */
	replace eduR=6 if yrsschool>=16 & yrsschool<.
	* NOTE(review): the label mentions the income year, but eduR is built from the maximum grade over all years
	label var eduR "Education of child (in year at which income was selected)"
	
	//Years of schooling (binned)
	tab yrsschool, nol m
	gen yrsschool_bin=.
	replace yrsschool_bin=6 if yrsschool>=1 & yrsschool<=7 
	replace yrsschool_bin=8 if yrsschool==8
	replace yrsschool_bin=10 if yrsschool>=9 & yrsschool<=11
	replace yrsschool_bin=12 if yrsschool==12 
	replace yrsschool_bin=14 if yrsschool>=13 & yrsschool<=15 
	replace yrsschool_bin=16 if yrsschool>=16 & yrsschool<.
	label var yrsschool_bin "Years of school, binned"
	tab yrsschool_bin, m
	
	* HS / college indicators, missing when eduR is missing
	gen hs_ed = eduR>=4 if eduR<.
	gen coll_ed = eduR>=6 if eduR<.
	label var hs_ed "HS educated" 
	label var coll_ed "College educated"

* Clean race variable
* (codes strictly between 2 and 8 are pooled into 3; codes 0 and 9 become missing)
	rename race race_temp
	replace race_temp = 3 if race_temp>2 & race_temp<8
	replace race_temp = . if inlist(race_temp, 0, 9)

* Grab most commonly reported race (lowest code in case of ties)
	bysort son_id: egen race = mode(race_temp), minmode
	label variable race "Race"

* Age

	/* Change agehead to be missing when respondent 
       is not household head. 
       Source of code: http://psidonline.isr.umich.edu/Guide/tutorials/IG/IG.pdf - page 37 */
	replace agehead = . if relate!=1 & year==1968
	replace agehead = . if (relate!=1 | sequencenumber!=1) & (year>=1969 & year<=1982)
	replace agehead = . if (relate!=10 | sequencenumber!=1) & (year>=1983 & year<=2015)

	//Construct consistent age variable
	* Ages 0 and 999 are treated as missing before deriving birth years
	replace age = . if inlist(age, 0, 999)
	generate byr_s = year - age
	by son_id: egen byr_s2 = min(byr_s) //take minimum of all possible birth years
	drop byr_s
	label variable byr_s2 "Birth year (using minimum)"
	* One year is subtracted from the survey-year age when building age_s
	generate age_s = year - byr_s2 - 1
	replace age_s = . if age_s<=0 | age_s>100
	label variable age_s "Age of child"

*----------------------------------------------------*
/* Save an extract of adult children aged 30-50
   + no income restriction.
   One row per child, carrying the first non-missing individual weight
   found in the age window. The panel is restored afterwards. */

	preserve
	
	keep age_s son_id indweight yrsschool race sex father_id
	keep if age_s>=30 & age_s<=50
	
	//Grab weight in first year available
	/* The survey year is not kept in this extract, so the within-child
	   order is declared on age_s, which is the survey year minus a constant
	   per child and therefore orders the rows chronologically. This makes
	   "first year available" explicit rather than dependent on whatever
	   order earlier commands left the data in. */
	replace indweight=. if indweight==0
	bysort son_id (age_s): gen number_obs= _n
	gen weight=.
	forval i=1(1)21 {
		* Take the weight of the i-th year only if no earlier year supplied one
		gen weight_temp = indweight if number_obs==`i'
		bysort son_id (age_s): egen weight2 = max(weight_temp)
		replace weight=weight2 if weight==.
		drop weight_temp weight2
	}
	
	* Keep the earliest in-window row per child, and only children with a weight
	bysort son_id (age_s): keep if _n==1
	keep if weight<.
	drop indweight
	
	gen hs_ed = yrsschool>=12 & yrsschool<.
	gen coll_ed = yrsschool>=16 & yrsschool<.
	gen black = race==2
	
	save ./output/Children_nofather_noincome_restrictions.dta, replace

	restore
*----------------------------------*

* Actual income around age 40 
	
	//Fix income variable: non-positive incomes are set to missing
	replace totfaminc = . if totfaminc<=0
	label variable totfaminc "Total family income"
 
    *---------------------------*
    /* MAZUMDER APPROACH TO 
       FINDING ACTUAL INCOME */
    *---------------------------*

	* Search settings: centre age 40, up to 10 years either side (ages 30-50),
	* and a single tagged income year per child
	local center = 40
	local band = 10
	local obs_ct = 1

	foreach inc of varlist totfaminc {

		* Tag a non-missing income observed exactly at the centre age, then count tags per child
		generate indson_`inc' = (`inc'!=. & age_s==`center')
		bysort son_id: egen total_indson_`inc' = total(indson_`inc')

		/* Walk outward from the centre one year at a time. At each distance
		   the older age is tried first and the younger age second; a year is
		   tagged only if its income is non-missing and the child has not yet
		   reached the required number of tagged years. The per-child count
		   is rebuilt after every attempt. */
		forvalues step = 0/`band' {
			* Older side (centre plus step)
			replace indson_`inc' = 1 if `inc'!=. & age_s==(`center'+`step') & (total_indson_`inc'<`obs_ct')
			drop total_indson_`inc'
			by son_id: egen total_indson_`inc' = total(indson_`inc')

			* Younger side (centre minus step)
			replace indson_`inc' = 1 if `inc'!=. & age_s==(`center'-`step') & total_indson_`inc'<`obs_ct'
			drop total_indson_`inc'
			by son_id: egen total_indson_`inc' = total(indson_`inc')
		}

		* Children without the required number of usable income years leave the sample
		drop if total_indson_`inc'<`obs_ct'

		* Each quantity below is read off the tagged row and then copied to all rows of the child

		* Income
		generate chosen_`inc' = `inc' if indson_`inc'==1
		by son_id: egen child_`inc' = min(chosen_`inc')
		label variable child_`inc' "`inc' chosen around age 40 for child"

		* Age
		generate chosen_age_`inc' = age_s if indson_`inc'==1
		by son_id: egen age_child_`inc' = min( chosen_age_`inc' )
		label variable age_child_`inc' "Age at which `inc' chosen around age 40 for child"

		* Survey year
		generate chosen_year_`inc' = year if indson_`inc'==1
		by son_id: egen year_child_`inc' = min( chosen_year_`inc' )
		label variable year_child_`inc' "Year at which `inc' chosen around age 40 for child"

		* Individual weight
		generate chosen_weight_`inc' = indweight if indson_`inc'==1
		by son_id: egen weight_`inc' = min(chosen_weight_`inc')
		label variable weight_`inc' "Weight at which `inc' chosen around age 40 for child" 
	}

* Keep one observation per adult child (the earliest survey year)
	bysort son_id (year): keep if _n==1

* Restrict to native born (foreignborn equal to 0 or missing)
	keep if inlist(foreignborn, 0, .)

* Verify: neither the chosen income nor its weight is missing
	assert !(child_totfaminc==. | weight_totfaminc==.)
		
***********************************************
*** PUT INCOMES IN BINS (FOLLOWING GSS BINS)
***********************************************
/* The chosen child income is recoded into the bin scheme that applies to
   the survey year in which it was chosen. Interior bins take a value inside
   the bin; the bottom bin takes 0.75 times its upper cut and the top bin
   1.25 times its lower cut. Each incomeYY_bins variable is blanked outside
   its own year range, and the schemes are then stacked into one bucket
   variable with bottom- and top-coding flags. */

	foreach var of varlist totfaminc {

		* Survey years before 1982
		gen income77_bins_`var'=.
		replace income77_bins_`var'=750 if child_`var'>=0 & child_`var'<1000 //<1000
		replace income77_bins_`var'=2000 if child_`var'>=1000 & child_`var'<3000 //1-3k
		replace income77_bins_`var'=4000 if child_`var'>=3000 & child_`var'<5000 //3-5k
		replace income77_bins_`var'=6000 if child_`var'>=5000 & child_`var'<7000 //5-7
		replace income77_bins_`var'=8500 if child_`var'>=7000 & child_`var'<10000 //7-10
		replace income77_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15
		replace income77_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income77_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income77_bins_`var'=37500 if child_`var'>=25000 & child_`var'<50000 //25-50 
		replace income77_bins_`var'=1.25*50000 if child_`var'>=50000 & child_`var'<. //50+ 
		replace income77_bins_`var'=. if year_child_`var'>=1982

		* Survey years 1982-1985
		gen income82_bins_`var'=.
		replace income82_bins_`var'=750 if child_`var'>=0 & child_`var'<1000 //less than 1000
		replace income82_bins_`var'=2000 if child_`var'>=1000 & child_`var'<3000 //1-3k
		replace income82_bins_`var'=4000 if child_`var'>=3000 & child_`var'<5000 //3-5k
		replace income82_bins_`var'=6000 if child_`var'>=5000 & child_`var'<7000 //5-7
		replace income82_bins_`var'=8500 if child_`var'>=7000 & child_`var'<10000 //7-10
		replace income82_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15
		replace income82_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income82_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income82_bins_`var'=30000 if child_`var'>=25000 & child_`var'<35000 //25-35 
		replace income82_bins_`var'=42500 if child_`var'>=35000 & child_`var'<50000 //35-50 
		replace income82_bins_`var'=1.25*50000 if child_`var'>=50000 & child_`var'<. //50+ 
		replace income82_bins_`var'=. if year_child_`var'<1982 | year_child_`var'>=1986

		* Survey years 1986-1990
		gen income86_bins_`var'=.
		replace income86_bins_`var'=0.75*4000 if child_`var'>=0 & child_`var'<4000 //less than 4k
		replace income86_bins_`var'=5000 if child_`var'>=4000 & child_`var'<6000 //4-6k
		replace income86_bins_`var'=7000 if child_`var'>=6000 & child_`var'<8000 //6-8k
		replace income86_bins_`var'=10250 if child_`var'>=8000 & child_`var'<12500 //8-12.5
		replace income86_bins_`var'=15000 if child_`var'>=12500 & child_`var'<17500 //12.5-17.5
		replace income86_bins_`var'=20000 if child_`var'>=17500 & child_`var'<22500 //17.5-22.5
		replace income86_bins_`var'=26250 if child_`var'>=22500 & child_`var'<30000 //22.5-30
		replace income86_bins_`var'=35000 if child_`var'>=30000 & child_`var'<40000 //30-40
		replace income86_bins_`var'=50000 if child_`var'>=40000 & child_`var'<60000 //40-60 
		replace income86_bins_`var'=1.25*60000 if child_`var'>=60000 & child_`var'<. //60+ 
		replace income86_bins_`var'=. if year_child_`var'<1986 | year_child_`var'>=1991
		
		* Survey years 1991-1997
		/* Cut points corrected to match the intended <5k / 5-7k / 7-10k bins.
		   Previously the bottom bin ran up to 6000 and the range 6000-10000
		   was assigned twice, so the 6000 value was always overwritten by
		   8500 and incomes of 5000-7000 were misclassified. */
		gen income91_bins_`var'=.
		replace income91_bins_`var'=0.75*5000 if child_`var'>=0 & child_`var'<5000 //less than 5k
		replace income91_bins_`var'=6000 if child_`var'>=5000 & child_`var'<7000 //5-7k
		replace income91_bins_`var'=8500 if child_`var'>=7000 & child_`var'<10000 //7-10k
		replace income91_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15k
		replace income91_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income91_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income91_bins_`var'=30000 if child_`var'>=25000 & child_`var'<35000 //25-35
		replace income91_bins_`var'=42500 if child_`var'>=35000 & child_`var'<50000 //35-50
		replace income91_bins_`var'=62500 if child_`var'>=50000 & child_`var'<75000 //50-75 
		replace income91_bins_`var'=1.25*75000 if child_`var'>=75000 & child_`var'<. //75+ 
		replace income91_bins_`var'=. if year_child_`var'<1991 | year_child_`var'>=1998 

		* Survey years 1998-2005
		gen income98_bins_`var'=.
		replace income98_bins_`var'=0.75*6000 if child_`var'>=0 & child_`var'<6000 //less than 6k
		replace income98_bins_`var'=8000 if child_`var'>=6000 & child_`var'<10000 //6-10k
		replace income98_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15k
		replace income98_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income98_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income98_bins_`var'=30000 if child_`var'>=25000 & child_`var'<35000 //25-35
		replace income98_bins_`var'=42500 if child_`var'>=35000 & child_`var'<50000 //35-50
		replace income98_bins_`var'=62500 if child_`var'>=50000 & child_`var'<75000 //50-75 
		replace income98_bins_`var'=92500 if child_`var'>=75000 & child_`var'<110000 //75-110 
		replace income98_bins_`var'=1.25*110000 if child_`var'>=110000 & child_`var'<. //110+ 
		replace income98_bins_`var'=. if year_child_`var'<1998 | year_child_`var'>=2006

		* Survey years 2006-2013
		gen income06_bins_`var'=.
		replace income06_bins_`var'=0.75*10000 if child_`var'>=0 & child_`var'<10000 //less than 10k
		replace income06_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15k
		replace income06_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income06_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income06_bins_`var'=30000 if child_`var'>=25000 & child_`var'<35000 //25-35
		replace income06_bins_`var'=42500 if child_`var'>=35000 & child_`var'<50000 //35-50
		replace income06_bins_`var'=62500 if child_`var'>=50000 & child_`var'<75000 //50-75 
		replace income06_bins_`var'=92500 if child_`var'>=75000 & child_`var'<110000 //75-110 
		replace income06_bins_`var'=130000 if child_`var'>=110000 & child_`var'<150000 //110-150 
		replace income06_bins_`var'=1.25*150000 if child_`var'>=150000 & child_`var'<. //150k+
		replace income06_bins_`var'=. if year_child_`var'<2006 | year_child_`var'>=2015
		
		* Survey year 2015 onward
		gen income16_bins_`var'=.
		replace income16_bins_`var'=0.75*10000 if child_`var'>=0 & child_`var'<10000 //less than 10k
		replace income16_bins_`var'=12500 if child_`var'>=10000 & child_`var'<15000 //10-15k
		replace income16_bins_`var'=17500 if child_`var'>=15000 & child_`var'<20000 //15-20
		replace income16_bins_`var'=22500 if child_`var'>=20000 & child_`var'<25000 //20-25
		replace income16_bins_`var'=30000 if child_`var'>=25000 & child_`var'<35000 //25-35
		replace income16_bins_`var'=42500 if child_`var'>=35000 & child_`var'<50000 //35-50
		replace income16_bins_`var'=62500 if child_`var'>=50000 & child_`var'<75000 //50-75 
		replace income16_bins_`var'=92500 if child_`var'>=75000 & child_`var'<110000 //75-110 
		replace income16_bins_`var'=130000 if child_`var'>=110000 & child_`var'<150000 //110-150
		replace income16_bins_`var'=160000 if child_`var'>=150000 & child_`var'<170000 //150-170 
		replace income16_bins_`var'=1.25*170000 if child_`var'>=170000 & child_`var'<. //170k+
		replace income16_bins_`var'=. if year_child_`var'<2015
		
	* Create 1 harmonized, binned measure (based on survey year)
		gen child_`var'_bucket=.
		replace child_`var'_bucket=income77_bins_`var' if year_child_`var'<1982
		replace child_`var'_bucket=income82_bins_`var' if year_child_`var'>=1982 & year_child_`var'<=1985
		replace child_`var'_bucket=income86_bins_`var' if year_child_`var'>=1986 & year_child_`var'<=1990 
		replace child_`var'_bucket=income91_bins_`var' if year_child_`var'>=1991 & year_child_`var'<=1997
		replace child_`var'_bucket=income98_bins_`var' if year_child_`var'>=1998 & year_child_`var'<=2005
		replace child_`var'_bucket=income06_bins_`var' if year_child_`var'>=2006 & year_child_`var'<=2013
		replace child_`var'_bucket=income16_bins_`var' if year_child_`var'==2015
		label var child_`var'_bucket "Income (`var') of child, in GSS buckets"
		
	* Flag children whose income falls in the bottom bin of their scheme
		/* The comparison values are the bottom-bin values actually assigned
		   above (0.75 times the lowest cut). The values previously tested for
		   1986 onward (2000, 2500, 3000, 5000) are never assigned by any
		   scheme, so the flag could not be set in those years. */
		gen bottomcoded_son=0
		replace bottomcoded_son=1 if income77_bins_`var'==750 & year_child_`var'<1982
		replace bottomcoded_son=1 if income82_bins_`var'==750 & (year_child_`var'>=1982 & year_child_`var'<=1985)
		replace bottomcoded_son=1 if income86_bins_`var'==0.75*4000 & (year_child_`var'>=1986 & year_child_`var'<=1990)
		replace bottomcoded_son=1 if income91_bins_`var'==0.75*5000 & (year_child_`var'>=1991 & year_child_`var'<=1997)
		replace bottomcoded_son=1 if income98_bins_`var'==0.75*6000 & (year_child_`var'>=1998 & year_child_`var'<=2005)
		replace bottomcoded_son=1 if income06_bins_`var'==0.75*10000 & (year_child_`var'>=2006 & year_child_`var'<=2013)
		replace bottomcoded_son=1 if income16_bins_`var'==0.75*10000 & year_child_`var'==2015
		
	* Flag children whose income falls in the top bin of their scheme
		gen topcoded_son=0
		replace topcoded_son=1 if income77_bins_`var'==1.25*50000 & year_child_`var'<1982
		replace topcoded_son=1 if income82_bins_`var'==1.25*50000 & (year_child_`var'>=1982 & year_child_`var'<=1985)
		replace topcoded_son=1 if income86_bins_`var'==1.25*60000 & (year_child_`var'>=1986 & year_child_`var'<=1990)
		replace topcoded_son=1 if income91_bins_`var'==1.25*75000 & (year_child_`var'>=1991 & year_child_`var'<=1997)
		replace topcoded_son=1 if income98_bins_`var'==1.25*110000 & (year_child_`var'>=1998 & year_child_`var'<=2005)
		replace topcoded_son=1 if income06_bins_`var'==1.25*150000 & (year_child_`var'>=2006 & year_child_`var'<=2013)
		replace topcoded_son=1 if income16_bins_`var'==1.25*170000 & year_child_`var'==2015
		
	}

	* Convert binned income to 1950$
		* The deflator is matched on the year before the survey year of the chosen income
		gen year_CPI = year_child_totfaminc - 1
		merge m:1 year_CPI using ../CPI/CPI_deflator.dta
		keep if _merge==3
		drop _merge
		
		gen fam_inc_real = child_totfaminc_bucket * deflator
		label var fam_inc_real "Child's family income (binned), in 1950 dollars"
		
		gen fam_inc_nobin_real = child_totfaminc * deflator
		label var fam_inc_nobin_real "Child's family income (not binned), in 1950 dollars"

	* Trim and save tempfile
		/* The two coding flags are listed under their full names
		   (topcoded_son, bottomcoded_son) rather than as abbreviations, so
		   the keep list also works with "set varabbrev off". */
		drop *_tmp
		keep famid son_id father_id sex race byr_s2 child* weight* age_child* year_child* edu* yrsschool* hs_ed coll_ed deflator ///
		fam_inc_real fam_inc_nobin_real topcoded_son bottomcoded_son 
		

		tempfile PSIDSons
		save `PSIDSons'
		
	* Save these respondents on their own (not merged with father income);
	* the in-memory data are restored afterwards
		preserve

			rename weight_totfaminc weight_psid
			rename age_child_totfaminc age
			generate black = (race==2)

			save ./output/PSIDFatherSONS_IGEanalysis_0yrs.dta, replace

		restore

*------------------------------------------------------------------------------* 
*------------------------------------------------------------------------------*

***********************
*** MERGE WITH FATHERS
***********************
/* For each father-income window (1, 5, 10 years): attach the matching
   father file to the children, build the analysis variables and save one
   output file per window. The children data are restored after each pass. */

	global numberyears "1 5 10" 
	foreach c in $numberyears {

	preserve

		* Only children whose father appears in the father file are kept
		merge m:1 father_id using `PSIDFathers_`c'', keep(match) nogenerate

	* Log incomes

		//fathers
		generate log_father_baseline = ln(mean_fam_inc_HHincome)
		label variable log_father_baseline "Logged father's household income"
		generate log_father_occscore = ln(mean_fam_inc_income_occ)
		label variable log_father_occscore  "Logged father's occscore"
		generate log_father_actual_inc = ln(mean_father_totfaminc)
		label variable log_father_actual_inc  "Logged (1950) father's actual income"

		//adult children
		generate log_son_baseline = ln(fam_inc_real)
		label variable log_son_baseline "Logged (1950) child's family income, binned"

		generate log_son_baseline_nobin = ln(fam_inc_nobin_real)
		label variable log_son_baseline_nobin "Logged (1950) child's family income, no bin"

	*Birth cohorts
		//Decade of birth, for birth years 1920 through 1989 (missing otherwise)
		tabulate byr_s2, missing
		generate decade = .
		forvalues d = 1920(10)1980 {
			replace decade = `d' if byr_s2>=`d' & byr_s2<=`d'+9
		}
		label variable decade "Decade of birth"

		//Generate dummies for each decade
		tabulate decade, gen(decade_)

	* Age^2 (at the age the child's income was chosen)
		generate age2_totfaminc = age_child_totfaminc ^2
		label variable age2_totfaminc "Age squared, totfaminc"

		rename byr_s2 dob

	* Analysis-friendly copies of some variables
		generate female = (sex==2)
		label variable female "Female "

		generate black = (race==2)
		label variable black "Black"

		generate age = age_child_totfaminc
		label variable age "Age of child"

		generate agesq = age2_totfaminc
		label variable agesq "Age sq. of child"

		generate year = year_child_totfaminc
		label variable year "Year at which income was chosen"

	* Unique identifier
		generate id_psid = son_id
		label variable id_psid "R ID (copy of son_id)"

	* Compress and save
		compress
		rename weight_totfaminc weight_psid

		duplicates report id_psid //no duplicates
		sort id_psid
		order id_psid weight_psid
		save ./output/PSIDFatherSons_IGEanalysis_`c'yrs.dta, replace

	restore
	}

